home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
TeX 1995 July
/
TeX CD-ROM July 1995 (Disc 1)(Walnut Creek)(1995).ISO
/
biblio
/
bibtex
/
utils
/
bibclean
/
match.c
< prev
next >
Wrap
C/C++ Source or Header
|
1992-11-21
|
10KB
|
394 lines
/***********************************************************************
==========
BACKGROUND
==========
This file contains an implementation of limited regular-expression
pattern matching code. The pattern syntax is simpler, more limited,
and different from normal regular-expression pattern matching syntax.
It is described in more detail below.
The motivation for this new code is that I found considerable
inconsistency in the matching behavior between versions of either
re_comp()/re_exec() or compile()/step() on these systems
DECstation 3100
IBM 3090
IBM PS/2
IBM RS/6000 AIX 3.2
NeXT Mach 3.0
Silicon Graphics IRIX 4.0
Stardent OS 2.2
Sun SPARC
That makes use of those regular-expression pattern-matching routines
unreliable across systems.
One possible solution would be to use the GNU re_comp() and re_exec()
from the regexp distribution on prep.ai.mit.edu (as of writing,
pub/gnu/regex-0.11.*). However, that code is large (5000+ lines), and
its installation uses configuration facilities that only work under
some variants of UNIX, and are completely useless on other operating
systems.
By contrast, the pattern matching code here is quite adequate for
bibclean's needs, and can be expressed in fewer than 140 lines. In
addition, it provides special handling of TeX control sequences and
braces that would be rather awkward to express in conventional
regular-expression syntax.
If the symbol TEST is defined at compile time, a main program will be
included that can be used for testing patterns supplied from stdin.
==============
PATTERN SYNTAX
==============
The string values to be pattern-matched are tab-free single-line
values delimited by quotation marks.
The patterns are represented by the following special markers:
a exactly one letter
A one or more letters
d exactly one digit
D one or more digits
w exactly one word (one or more letters and digits)
W one or more space-separated words, beginning and ending
with a word
X one or more special-separated words, beginning and ending
with a word
. one special character (see SPECIAL_CHARS defined below)
: one or more special characters
<space> one or more spaces
\x exactly one x (x is any character)
x exactly the character x (x is anything but aAdDwWX.:<space>\)
Special characters are a subset of punctuation characters that are
typically used in values.
Note the \<space> represents a single literal space, \\ a single
literal backslash, \a the letter a, \A the letter A, \d the letter d,
\D the letter D, and so on. Remember to double all backslashes in C
strings: \a must be entered as \\a, and "and" as "\\an\\d".
Each pattern is matched against the entire string and must match
successfully for a YES return from match_pattern(). Consequently,
there is no need for an analogue of ^ and $ in full regular
expressions. Neither is there provision for matching on arbitrary
sets of characters. Instead, fixed sets of characters are provided
(conventional regular-expression equivalents are shown in
parentheses):
digits ([0-9]),
alphanumerics ([A-Za-z0-9]),
space ([ \t\f\r\n\v]), and
special ([][ !#()*+,-./:;?~])
In addition, TeX control sequences of the form
\<one-special-character> or \<letter-sequence> in the string are
ignored in the match, together with any following whitespace.
Braces are also ignored, but not whitespace following them.
Thus "{TR\slash A87}" matches the patterns "AD" and "W", and
"{TR A\slash 87}" matches the patterns "A AD" and "A W".
[11-Nov-1992]
***********************************************************************/
#include "os.h"
#include "xstdlib.h"
#include "xstring.h"
#include "xctype.h"
RCSID("$Id: match.c,v 1.2 1992/11/22 17:44:32 beebe Exp beebe $")
/* $Log: match.c,v $
* Revision 1.2 1992/11/22 17:44:32 beebe
* Change type of match_patterns from int to YESorNO for version 2.05 bibclean.
*
* Revision 1.1 1992/11/15 08:20:05 beebe
* Initial revision
* */
#define NEW_STYLE (__cplusplus || __STDC__ || c_plusplus)
#if NEW_STYLE
#define VOID void
#else /* K&R style */
#define VOID
#endif /* NEW_STYLE */
#if NEW_STYLE
typedef enum { NO = 0, YES = 1 } YESorNO;
#else /* K&R style */
#define NO 0 /* must be FALSE (zero) */
#define YES 1 /* must be TRUE (non-zero) */
typedef int YESorNO;
#endif /* NEW_STYLE */
#include "match.h"
/* Fallback definition for environments whose headers do not supply
   EXIT_SUCCESS; the TEST driver's main() passes it to exit(). */
#ifndef EXIT_SUCCESS
#define EXIT_SUCCESS 0
#endif
/* The punctuation characters (including the blank) that the pattern
   markers '.', ':' and 'X' treat as "special". */
#define SPECIAL_CHARS " !#()*+,-./:;?[]~"

/* isspecial(c) is non-zero when c is one of SPECIAL_CHARS.

   The explicit test against '\0' is essential: strchr() considers the
   terminating NUL to be part of the string it searches, so without the
   test the end-of-string character would be classified as special and
   a "while (isspecial(*s))" scan would run off the end of its string.

   NOTE: the argument is evaluated twice, so it must not have side
   effects. */
#define isspecial(c) (((c) != '\0') && (strchr(SPECIAL_CHARS,(c)) != (char*)NULL))
/* Forward declaration: next_s() is defined after match_pattern(),
   which calls it. */
static const char *next_s ARGS((const char *s_));
/***********************************************************************
 match_pattern() -- test whether the whole of string s matches pattern.

 s        NUL-terminated value string.  Braces and TeX control
          sequences in it are skipped by next_s() and never take part
          in the match.
 pattern  NUL-terminated pattern made of the markers
          a A d D w W X . : <space> \x and literal characters.

 Returns YES only if every pattern element matches in order AND the
 string is exhausted when the pattern ends; otherwise NO.

 Matching is greedy and never backtracks.  For 'W' and 'X', a run of
 separators that is not followed by another word is still consumed.

 A pattern that ends in a lone backslash is malformed and yields NO.

 All <ctype.h> arguments are cast to unsigned char, because passing a
 negative plain char to those functions is undefined.
***********************************************************************/
#if NEW_STYLE
YESorNO
match_pattern(const char *s, const char *pattern)
#else /* K&R style */
YESorNO
match_pattern(s,pattern)
const char *s;
const char *pattern;
#endif /* NEW_STYLE */
{
    /* next_s() increments its argument before examining anything, so
       handing it s-1 positions s on the first significant character
       without reading the byte in front of the string.
       NOTE(review): forming s-1 is, strictly speaking, out-of-bounds
       pointer arithmetic; it is retained because next_s() offers no
       other way to skip ignorable text at the current position. */
    s = next_s(s-1);

    for ( ; *pattern != '\0'; ++pattern)
    {
        switch (*pattern)
        {
        case 'a':               /* single letter */
            if (!isalpha((unsigned char)*s))
                return (NO);
            s = next_s(s);
            break;

        case 'A':               /* one or more letters */
            if (!isalpha((unsigned char)*s))
                return (NO);
            while (isalpha((unsigned char)*s))
                s = next_s(s);
            break;

        case 'd':               /* single digit */
            if (!isdigit((unsigned char)*s))
                return (NO);
            s = next_s(s);
            break;

        case 'D':               /* one or more digits */
            if (!isdigit((unsigned char)*s))
                return (NO);
            while (isdigit((unsigned char)*s))
                s = next_s(s);
            break;

        case 'w':               /* one word (letters and digits) */
            if (!isalnum((unsigned char)*s))
                return (NO);
            while (isalnum((unsigned char)*s))
                s = next_s(s);
            break;

        case 'W':               /* one or more space-separated words */
            if (!isalnum((unsigned char)*s))
                return (NO);
            while (isalnum((unsigned char)*s))  /* parse first word */
                s = next_s(s);
            while (isspace((unsigned char)*s))
            {
                while (isspace((unsigned char)*s)) /* parse separators */
                    s = next_s(s);
                while (isalnum((unsigned char)*s)) /* parse another word */
                    s = next_s(s);
            }
            break;

        case 'X':               /* one or more special-separated words */
            if (!isalnum((unsigned char)*s))
                return (NO);
            while (isalnum((unsigned char)*s))  /* parse first word */
                s = next_s(s);
            /* The explicit end-of-string tests keep the scan inside the
               string no matter how isspecial() classifies NUL. */
            while ((*s != '\0') && isspecial(*s))
            {
                while ((*s != '\0') && isspecial(*s)) /* parse separators */
                    s = next_s(s);
                while (isalnum((unsigned char)*s)) /* parse another word */
                    s = next_s(s);
            }
            break;

        case ' ':               /* one or more whitespace characters */
            if (!isspace((unsigned char)*s))
                return (NO);
            while (isspace((unsigned char)*s))
                s = next_s(s);
            break;

        case '.':               /* exactly one special character */
            if ((*s == '\0') || !isspecial(*s))
                return (NO);
            s = next_s(s);      /* consume it, as 'a' and 'd' do */
            break;

        case ':':               /* one or more special characters */
            if ((*s == '\0') || !isspecial(*s))
                return (NO);
            while ((*s != '\0') && isspecial(*s))
                s = next_s(s);
            break;

        case '\\':              /* literal next character */
            pattern++;
            if (*pattern == '\0')       /* nothing follows the backslash */
                return (NO);
            /* fall through to exact match test */

        default:                /* anything else: exact match */
            if (*pattern != *s)
                return (NO);
            s = next_s(s);
            break;
        } /* end switch */
    } /* end for */

    return ((*s == '\0') ? YES : NO);   /* YES if reached end of string */
}
/***********************************************************************
 next_s() -- step to the next significant character of a string.

 s   pointer to the character just consumed.  It is incremented
     BEFORE anything is examined, so *s itself is never read; only
     the characters after it are.

 Returns a pointer to the first character after s that is not part of
 ignorable text, or to the terminating NUL if there is none.
 Ignorable text is:
     - a brace, '{' or '}' (whitespace after a brace is NOT skipped);
     - a TeX control sequence, i.e. a backslash followed by either a
       run of letters or one non-letter character, together with any
       whitespace that follows it.

 A backslash that is the last character of the string is skipped by
 itself; the scan then stops at the terminating NUL instead of
 stepping over it.

 <ctype.h> arguments are cast to unsigned char, because passing a
 negative plain char to those functions is undefined.
***********************************************************************/
#if NEW_STYLE
static const char *
next_s(const char *s)
#else /* K&R style */
static const char *
next_s(s)
const char *s;
#endif /* NEW_STYLE */
{
    ++s;                        /* leave the character just consumed */

    for (;;)
    {
        if (*s == '\\')         /* TeX control sequence */
        {
            ++s;                /* look at next character */
            if (isalpha((unsigned char)*s))
            {                   /* \<one-or-more-letters> */
                do
                    ++s;
                while (isalpha((unsigned char)*s));
            }
            else if (*s != '\0')
                ++s;            /* \<non-letter> */
            /* else: lone backslash at end of string; stay on the NUL */

            while (isspace((unsigned char)*s))
                ++s;            /* TeX discards this whitespace too */
        }
        else if ((*s == '{') || (*s == '}'))
            ++s;                /* braces are ignored */
        else
            break;              /* significant character or NUL */
    }
    return (s);
}
#ifdef TEST
/* Size of the input line buffer that main() hands to fgets(). */
#define MAXLINE 256
/* Null message pointer: process() prints nothing for a matching table
   entry whose message is NO_WARNING. */
#define NO_WARNING (const char *)NULL
/* Pattern table for what the name suggests are year values.  Each
   entry pairs a pattern (which includes the delimiting quotation
   marks) with NO_WARNING, and the table ends with a NULL pattern.
   main() in this file does not reference this table.
   NOTE(review): the MATCH_PATTERN layout comes from match.h; the
   pattern/message field order is assumed from how process() reads
   the entries. */
MATCH_PATTERN year_patterns[] =
{
{"\"DDDD\"", NO_WARNING},
{"\"DDDD,WDDDD\"", NO_WARNING},
{"\"DDDD, DDDD, DDDD\"", NO_WARNING},
{(const char*)NULL, NO_WARNING},
};
/* Pattern table used by the TEST driver: main() passes it to process()
   for every input line.  Each pattern includes the delimiting
   quotation marks.  The second string of each entry is what process()
   prints as the message when that pattern is the first to match; here
   it holds a sample value of the corresponding form.  The table ends
   with a NULL pattern.
   NOTE(review): the MATCH_PATTERN layout comes from match.h; the
   pattern/message field order is assumed from how process() reads
   the entries. */
MATCH_PATTERN number_patterns[] =
{
{"\"D\"", "23"},
{"\"A AD\"", "PN LPS5001"},
{"\"A D(D)\"", "RJ 34(49)"},
{"\"A D\"", "XNSS 288811"},
{"\"A D\\.D\"", "Version 3.20"},
{"\"A-A-D-D\"", "UMIAC-TR-89-11"},
{"\"A-A-D\"", "CS-TR-2189"},
{"\"A-A-D\\.D\"", "CS-TR-21.7"},
{"\"A-AD-D\"", "TN-K\\slash 27-70"},
{"\"A-D D\"", "PB-251 845"},
{"\"A-D-D\"", "ANL-30-74"},
{"\"A-D\"", "TR-2189"},
{"\"AD-D-D\"", "GG24-3611-00"},
{"\"AD-D\"", "SP43-29"},
{"\"AD\"", "LPS0064"},
{"\"A\\#D-D\"", "TR\\#89-24 ????"},
{"\"D \\an\\d D\"", "11 and 12"},
{"\"D+D\"", "3+4"},
{"\"D-D\"", "23-27"},
{"\"D/D\"", "23/27"},
{"\"DA\"", "23A"},
{"\"D\\.D\"", "3.4"},
{"\"W-W W\"", "AERE-R 12329"},
{"\"W-W-WW-W\"", "OSU-CISRC-4\\slash 87-TR9"},
{"\"W\"", "Computer Science Report 100"},
{"\"X\"", "TR/AB/3-43.7-3/AB"},
{(const char*)NULL, NO_WARNING},
};
/* Prototypes for the TEST driver: main() reads lines from stdin and
   process() matches one line against a pattern table. */
int main ARGS((int argc,char* argv[]));
static void process ARGS((const char *line_, MATCH_PATTERN patterns_[]));
/* TEST driver: read stdin one line at a time, remove the trailing
   newline if one is present, and match the line against
   number_patterns via process().  Exits with EXIT_SUCCESS at end of
   input.  argc and argv are not used. */
#if NEW_STYLE
int
main(int argc, char* argv[])
#else /* K&R style */
int
main(argc,argv)
int argc;
char* argv[];
#endif /* NEW_STYLE */
{
    char buffer[MAXLINE];
    char *newline;

    for (;;)
    {
        if (fgets(buffer,MAXLINE,stdin) == (char*)NULL)
            break;                      /* end of input (or read error) */

        newline = strchr(buffer,'\n');
        if (newline != (char *)NULL)
            *newline = '\0';            /* chop the line terminator */

        process(buffer,number_patterns);
    }

    exit (EXIT_SUCCESS);
    return (EXIT_SUCCESS);
}
/* Match line against each entry of patterns in turn, stopping at the
   first entry whose pattern is NULL.  On the first successful match,
   print the entry's message (unless it is NO_WARNING) and return.  If
   no entry matches, report the line as an illegal value. */
#if NEW_STYLE
static void
process(const char *line, MATCH_PATTERN patterns[])
#else /* K&R style */
static void
process(line,patterns)
const char *line;
MATCH_PATTERN patterns[];
#endif /* NEW_STYLE */
{
    MATCH_PATTERN *entry;

    for (entry = patterns; entry->pattern != (const char*)NULL; ++entry)
    {
        if (match_pattern(line,entry->pattern) != YES)
            continue;                   /* try the next pattern */

        if (entry->message != NO_WARNING)
            printf("%%%% [%-24s]: %s\n", line, entry->message);
        return;
    }

    printf("?? [%-24s]: Illegal value\n", line);
}
#endif /* TEST */